MAE augmentation (primary and secondary outcome definitions)

Author

Laura Symul, Laura Vermeren

Published

January 9, 2026

Code
library(tidyverse)
library(magrittr)
library(gt)
library(patchwork)
library(SummarizedExperiment)
library(tidySummarizedExperiment)
library(MultiAssayExperiment)

tmp <- fs::dir_map("R scripts/", source)
tmp <- fs::dir_map("../VIBRANT-99-utils/R/", source)
rm(tmp)
theme_set(theme_light())

The primary purpose of this document is to define and create additional assays to the MultiAssayExperiment object that contain the primary and secondary outcomes of the VIBRANT trial.

Loading the MultiAssayExperiment object

Code
mae_files <- 
  fs::dir_ls(
    get_02_output_dir(),
    regexp = "mae_full_.*\\.rds$"
  ) |> 
  sort(decreasing = TRUE) |>
  magrittr::extract(1)

mae <- readRDS(mae_files)

rm(mae_files)

qPCR data

Comparison of the total 16S in weekly vs daily samples

Code
qpcr_16S <- 
  bind_rows(
    mae[["qPCR_daily"]] |> 
      as_tibble() |>
      dplyr::filter(.feature == "16S", !is.na(copies_per_swab_med)) |>
      select(uid, dilution, copies_per_swab_med) |> 
      mutate(assay = "daily qPCR") |> 
      expand_grid(calibration = c("orig", "LL3", "LM")),
    mae[["qPCR"]] |> 
      as_tibble() |>
      dplyr::filter(.feature == "16S") |>
      select(uid, dilution, copies_per_swab_med, copies_per_swab_orig_med, copies_per_swab_LM_med) |> 
      pivot_longer(
        cols = c(copies_per_swab_med, copies_per_swab_orig_med, copies_per_swab_LM_med),
        names_to = "calibration",
        names_pattern = "copies_per_swab_(.*)_med",
        values_to = "copies_per_swab_med"
      ) |>
      mutate(
        calibration = calibration |> str_replace_na("LL3"),
        assay = "weekly qPCR"
        )
  ) |> 
  dplyr::left_join(
    mae@colData |> 
      as_tibble() |> 
      dplyr::select(
        uid, pid, visit_code, visit, study_day, sample_type, 
        site, randomized, arm, ITT, mITT, PP),
    by = join_by(uid)
    ) |>
  dplyr::filter(sample_type == "Clinical sample") |> 
  mutate(calibration = calibration |> fct_inorder())
Code
qpcr_16S |>
  filter(randomized, study_day > -10) |> 
  ggplot() +
  aes(x = study_day, y = copies_per_swab_med, color = assay) +
  facet_grid(site ~ calibration, 
    scales = "free_y", space = "free_y", labeller = label_both
    ) +
  geom_line(aes(group = interaction(pid, assay)), alpha = 0.2) +
  geom_point(alpha = 0.5, size = 0.5) +
  scale_y_log10() +
  theme(
    strip.text.y = element_text(angle = 0)
  )

Code
selected_time_points <- 
  qpcr_16S |> 
  filter(randomized) |> 
  group_by(pid, study_day) |> 
  mutate(n_assays = n_distinct(assay)) |>
  ungroup() |> 
  dplyr::filter(n_assays == 2) 

tmp <- 
 dplyr::full_join(
   selected_time_points |> 
     filter(assay == "weekly qPCR") |> 
     dplyr::select(calibration, copies_per_swab_med, pid, study_day, site) |> 
     dplyr::rename(copies_per_swab_med_weekly = copies_per_swab_med),
   selected_time_points |> 
     filter(assay == "daily qPCR")  |> 
     dplyr::select(calibration, copies_per_swab_med, pid, study_day, site) |> 
     distinct() |> 
     dplyr::rename(copies_per_swab_med_daily = copies_per_swab_med)
 )
Code
tmp |> 
  ggplot() +
  aes(x = copies_per_swab_med_daily, y = copies_per_swab_med_weekly, col = calibration) +
  facet_grid(
    site ~ calibration, labeller = label_both
    ) +
  geom_abline(slope = 1, intercept = 0, color = "black") +
  geom_point(alpha = 0.5, size = 0.5) +
  scale_x_log10("16S copies/swabs - Daily swabs") +
  scale_y_log10("16S copies/swabs - Weekly swabs") +
  coord_fixed() +
  ggtitle("Swabs collected the same day from the same participant")

Comparison of LBP strains in weekly vs daily samples

Code
qpcr_LBP <- 
  bind_rows(
    mae[["qPCR_daily"]] |> 
      as_tibble() |>
      dplyr::filter(.feature != "16S", !is.na(copies_per_swab_med)) |>
      select(uid, dilution, target, copies_per_swab_med) |> 
      mutate(assay = "daily qPCR"),
    mae[["qPCR"]] |> 
      as_tibble() |>
      dplyr::filter(.feature != "16S") |>
      select(uid, dilution, target, copies_per_swab_med) |>
      mutate(assay = "weekly qPCR")
  ) |> 
  dplyr::left_join(
    mae@colData |> 
      as_tibble() |> 
      dplyr::select(
        uid, pid, visit_code, visit, study_day, sample_type, 
        site, randomized, arm, ITT, mITT, PP),
    by = join_by(uid)
    ) |>
  dplyr::filter(sample_type == "Clinical sample")
Code
qpcr_LBP |>
  filter(randomized, study_day > -10) |> 
  ggplot() +
  aes(x = study_day, y = copies_per_swab_med, color = assay) +
  facet_grid(site ~ target, 
    scales = "free_y", space = "free_y", labeller = label_both
    ) +
  geom_line(aes(group = interaction(pid, assay)), alpha = 0.2) +
  geom_point(alpha = 0.5, size = 0.5) +
  scale_y_log10() +
  theme(
    strip.text.y = element_text(angle = 0)
  )

Code
qpcr_LBP |>
  filter(mITT, study_day > -10, site == "MGH") |> 
  ggplot() +
  aes(x = study_day, y = copies_per_swab_med, color = assay) +
  facet_grid(site + pid ~ target, labeller = label_both) +
  geom_line(alpha = 0.2) +
  geom_point(alpha = 0.5, size = 0.5) +
  scale_y_log10() +
  theme(
    strip.text.y = element_text(angle = 0)
  )

Code
qpcr_LBP |>
  filter(mITT, study_day > -10, site == "CAP", arm == "LC-106-o") |> 
  ggplot() +
  aes(x = study_day, y = copies_per_swab_med, color = assay) +
  ggplot2::facet_grid(pid ~ target) + #    , labeller = label_both
  geom_line(alpha = 0.2) +
  geom_point(alpha = 0.5, size = 0.5) +
  scale_y_log10() +
  theme(
    strip.text.y = element_text(angle = 0)
  )
Code
qpcr_LBP |>
  filter(randomized, study_day > -10) |> 
  ggplot() +
  aes(x = assay, y = copies_per_swab_med, color = assay, fill = assay) +
  facet_grid(site ~ target, 
    scales = "free_y", space = "free_y", labeller = label_both
    ) +
  geom_boxplot(alpha = 0.5) +
  scale_y_log10() +
  theme(
    strip.text.y = element_text(angle = 0),
    axis.text.x = element_blank(),
    legend.position = "bottom"
  )

Code
selected_time_points <- 
  qpcr_LBP |> 
  filter(randomized) |> 
  group_by(pid, study_day) |> 
  mutate(n_assays = n_distinct(assay)) |>
  ungroup() |> 
  dplyr::filter(n_assays == 2) 

tmp <- 
 dplyr::full_join(
   selected_time_points |> 
     filter(assay == "weekly qPCR") |> 
     dplyr::select(target, copies_per_swab_med, pid, study_day, site) |> 
     dplyr::rename(copies_per_swab_med_weekly = copies_per_swab_med),
   selected_time_points |> 
     filter(assay == "daily qPCR")  |> 
     dplyr::select(target, copies_per_swab_med, pid, study_day, site) |> 
     distinct() |> 
     dplyr::rename(copies_per_swab_med_daily = copies_per_swab_med)
 )
Code
tmp |> 
  mutate(
    study_phase = case_when(
      study_day <= 0 ~ "pre-MTZ",
      study_day > 0 & study_day <=7 ~ "Week 1",
      study_day <= 15 ~ "Week 2",
      TRUE ~ "Week 3+"
    ) |> factor(levels = c("pre-MTZ", "Week 1", "Week 2", "Week 3+"))
  ) |> 
  ggplot() +
  aes(x = copies_per_swab_med_daily, y = copies_per_swab_med_weekly, col = study_phase) +
  facet_grid(
    site ~ target, labeller = label_both
    ) +
  geom_abline(slope = 1, intercept = 0, color = "black") +
  geom_point(alpha = 0.5, size = 0.5) + # aes(col = study_day), 
  scale_x_log10("Copies/swabs - Daily swabs") +
  scale_y_log10("Copies/swabs - Weekly swabs") +
  # scale_color_gradient(low = "dodgerblue1", high = "red") +
  coord_fixed()

Code
tmp |> 
  ggplot() +
  aes(x = copies_per_swab_med_daily, y = copies_per_swab_med_weekly, col = site) +
  ggh4x::facet_nested_wrap(target + site ~ ., ncol = 10) +
  geom_abline(slope = 1, intercept = 0, color = "black") +
  geom_point(alpha = 0.5, size = 0.5) + # aes(col = study_day), 
  scale_x_log10("Copies/swabs - Daily swabs") +
  scale_y_log10("Copies/swabs - Weekly swabs") +
  scale_color_manual(values = site_colors) +  
  coord_fixed() +
  theme(legend.position = "bottom", axis.text.x = element_text(angle = 45, hjust = 1))

Primary outcomes

Primary outcome definition

The primary outcome is “colonization with any of the L. crispatus strains contained in the LBP by 5 weeks of follow-up as assessed by metagenomic sequencing of the vaginal microbial community with detection of any one of LBP strains at >5% relative abundance or a combination of the strains accounting for >10% relative abundance by metagenomics.”

By metagenomics

“LBP colonization” at each visit

We compute, at each visit and for each participant, the total relative abundance of all LBP strains or the maximum relative abundance of any LBP strain.

If the total relative abundance of all LBP strains is larger than 0.1 or the maximum relative abundance of any LBP strain is larger than 0.05, we consider that the LBP achieved colonization_mg at that visit.

Code
colonization_LBP_mg <- 
  mae[["mg"]] |> 
  as_tibble() |>
  dplyr::left_join(mae@colData |> as_tibble()) |> 
  dplyr::filter(sample_type == "Clinical sample", !poor_quality_mg_data) |> 
  dplyr::filter(!is.na(LBP)) |> 
  group_by(.sample) |> 
  summarize(
    n = n(),
    total_LBP_rel_ab_at = sum(rel_ab),
    max_LBP_rel_ab_at = max(rel_ab),
    n_detected_strains_at = sum(rel_ab > 0),
    n_detected_LC106_strains_at = sum(rel_ab[LBP == "LC106 & LC115"] > 0),
    n_detected_LC115_only_strains_at = sum(rel_ab[LBP == "LC115"] > 0)
  ) |> 
  mutate(
    colonized_LBP_at_mg = (total_LBP_rel_ab_at > 0.1) | (max_LBP_rel_ab_at > 0.05)
  ) 
Code
colonization_LBP_mg <-  
  colonization_LBP_mg |> 
  dplyr::left_join(
    mae@colData |> as_tibble() |> 
      select(uid, pid, visit_code, randomized, arm, location) |> 
      dplyr::rename(.sample = uid), 
    by = join_by(.sample)
  ) |> 
  arrange(pid, visit_code) 
Code
colonization_LBP_mg |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = colonized_LBP_at_mg) +
  geom_tile() +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
    ) +
  theme(    
    strip.text.y = element_text(angle = 0)
  )

“LBP colonization” by each visit

From the colonization_mg status at each visit, we can compute the colonization_mg status by each visit.

A participant is considered to have colonized by a visit if they have been colonized at that visit or any previous visit, starting from their post-product visit (“1200” for all groups).

Since we have some missing visits, we impute these as “not colonized”

Code
colonization_LBP_mg <- 
  colonization_LBP_mg |> 
  select(-c(arm, randomized, location)) |> 
  dplyr::full_join(
    colonization_LBP_mg |> select(pid, arm, randomized, location) |> distinct() |> 
      expand_grid(
        visit_code = unique(colonization_LBP_mg$visit_code) |> sort()
      ),
    by = join_by(pid, visit_code)
  )
Code
colonization_LBP_mg |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = colonized_LBP_at_mg) +
  geom_tile() +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
    ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

Code
colonization_LBP_mg <- 
  colonization_LBP_mg |> 
  mutate(colonized_LBP_at_mg = colonized_LBP_at_mg |> replace_na(FALSE)) |>
  arrange(pid, visit_code) |>
  group_by(pid) |> 
  mutate(
    tmp = ifelse(as.numeric(visit_code) < 1200, FALSE, colonized_LBP_at_mg),
    colonized_LBP_by_mg = cummax(tmp) |> as.logical()
  ) |> 
  ungroup() |> 
  select(-tmp)
Code
colonization_LBP_mg |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = colonized_LBP_by_mg) +
  geom_tile() +
  annotate(
    geom = "rect", 
    xmin = 10.5, xmax = 11.5, ymin = -Inf, ymax = Inf,
    col = "black", alpha = 0, linewidth = 1
  ) +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
  ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

L. crispatus colonization” at each visit

We compute, at each visit and for each participant, the total relative abundance of L. crispatus.

If the total relative abundance of L. crispatus strains is larger than 50%, we consider that the participant’s VM was colonized by L. crispatus at that visit.

Code
colonization_crispatus_mg <- 
  mae[["mg"]] |> 
  as_tibble() |>
  dplyr::left_join(mae@colData |> as_tibble()) |>
  dplyr::filter(sample_type == "Clinical sample", !poor_quality_mg_data) |> 
  dplyr::filter(species == "crispatus") |> 
  group_by(.sample) |> 
  summarize(
    total_crispatus_rel_ab_at = sum(rel_ab),
  ) |> 
  mutate(
    crispatus_dominance_at_mg = (total_crispatus_rel_ab_at >= 0.5)
  ) 
Code
colonization_crispatus_mg <-  
  colonization_crispatus_mg |> 
  dplyr::left_join(
    mae@colData |> as_tibble() |> 
      select(uid, pid, visit_code, randomized, arm, location) |> 
      dplyr::rename(.sample = uid), 
    by = join_by(.sample)
  ) |> 
  arrange(pid, visit_code) 
Code
colonization_crispatus_mg |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = crispatus_dominance_at_mg) +
  geom_tile() +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
    ) +
  theme(    
    strip.text.y = element_text(angle = 0)
  )

“Crispatus colonization” by each visit

From the colonization_mg status at each visit, we can compute the colonization_mg status by each visit.

A participant is considered to have colonized by a visit if they have been colonized at that visit or any previous visit, starting from their post-product visit (“1200” for all groups).

Since we have some missing visits, we impute these as “not colonized”

Code
colonization_crispatus_mg <- 
  colonization_crispatus_mg |> 
  select(-c(arm, randomized, location)) |> 
  dplyr::full_join(
    colonization_crispatus_mg |> select(pid, arm, randomized, location) |> distinct() |> 
      expand_grid(
        visit_code = unique(colonization_crispatus_mg$visit_code) |> sort()
      ),
    by = join_by(pid, visit_code)
  )
Code
colonization_crispatus_mg |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = crispatus_dominance_at_mg) +
  geom_tile() +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
    ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

Code
colonization_crispatus_mg <- 
  colonization_crispatus_mg |> 
  arrange(pid, visit_code) |>
  mutate(crispatus_dominance_at_mg = crispatus_dominance_at_mg |> replace_na(FALSE)) |>
  group_by(pid) |> 
  mutate(
    tmp = ifelse(as.numeric(visit_code) < 1200, FALSE, crispatus_dominance_at_mg),
    crispatus_dominance_by_mg = cummax(tmp) |> as.logical()
  ) |> 
  ungroup() #|> 
  #select(-tmp)
Code
colonization_crispatus_mg |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = crispatus_dominance_by_mg) +
  geom_tile() +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
  ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

LBP colonization VS crispatus dominance

Code
p1 <- colonization_LBP_mg |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = colonized_LBP_at_mg) +
  geom_tile() +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
    ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

p2 <- colonization_crispatus_mg |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = crispatus_dominance_at_mg) +
  geom_tile() +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
    ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

 p1 + p2 + 
    plot_annotation(
      title = str_c("LBP colonization VS crispatus dominance")
    ) +
    plot_layout(ncol = 2) &
    theme(
      legend.justification = "left"
    )

Code
p1 <- colonization_LBP_mg |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = colonized_LBP_by_mg) +
  geom_tile() +
  annotate(
    geom = "rect", 
    xmin = 10.5, xmax = 11.5, ymin = -Inf, ymax = Inf,
    col = "black", alpha = 0, linewidth = 1
  ) +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
  ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

p2 <- colonization_crispatus_mg |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = crispatus_dominance_by_mg) +
  geom_tile() +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
  ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

p1 + p2 + 
    plot_annotation(
      title = str_c("LBP colonization VS crispatus dominance")
    ) +
    plot_layout(ncol = 2) &
    theme(
      legend.justification = "left"
    )

By qPCR

“LBP colonization” at each visit

We compute, at each visit and for each participant, the relative abundance of each LBP strain as estimated by qPCR by dividing the copies per swab for that strain by the copies per swab for the 16S rRNA gene.

Then, just as we did for the metagenomics, we compute the total relative abundance of all LBP strains or the maximum relative abundance of any LBP strain.

If the total relative abundance of all LBP strains is larger than 0.1 or the maximum relative abundance of any LBP strain is larger than 0.05, we consider that the LBP achieved colonization at that visit by qPCR.

We also consider an additional secondary outcome, which is that if at least 2 strains are detected at >0, we consider that the participant is colonized by LBP at that visit by qPCR.

Code
colonization_qPCR <- 
  mae[["qPCR"]] |> 
  as_tibble() |>
  left_join(mae@colData |> as_tibble()) |>
  dplyr::filter(sample_type == "Clinical sample") |> 
  dplyr::filter(!is.na(LBP)) |> 
  # we add the 16S copies per swab to compute the relative abundance
  left_join(
    mae[["qPCR"]] |> 
      as_tibble() |>
      left_join(mae@colData |> as_tibble()) |>
      dplyr::filter(sample_type == "Clinical sample") |> 
      dplyr::filter(is.na(LBP), .feature == "16S") |> 
      select(.sample, copies_per_swab_med) |>
      dplyr::rename(copies_per_swab_16S = copies_per_swab_med),
    by = join_by(.sample)
  ) |> 
  mutate(rel_ab = copies_per_swab_med / copies_per_swab_16S) |> 
  select(.feature, .sample, rel_ab, copies_per_swab_med, copies_per_swab_16S, everything()) |> 
  group_by(.sample) |> 
  summarize(
    total_LBP_rel_ab_at = sum(rel_ab),
    max_LBP_rel_ab_at = max(rel_ab),
    n_detected_strains_at = sum(copies_per_swab_med > 0),
    n_detected_LC106_strains_at = sum(copies_per_swab_med[LBP == "LC106 & LC115"] > 0),
    n_detected_LC115_only_strains_at = sum(copies_per_swab_med[LBP == "LC115"] > 0)
  ) |> 
  mutate(
    colonized_LBP_at_qpcr = (total_LBP_rel_ab_at > 0.1) | (max_LBP_rel_ab_at > 0.05),
    LBP_detected_at_qpcr = (n_detected_strains_at > 2)
  ) 
Code
colonization_qPCR <-  
  colonization_qPCR |> 
  dplyr::left_join(
    mae@colData |> as_tibble() |> 
      select(uid, pid, visit_code, randomized, arm, location) |> 
      dplyr::rename(.sample = uid), 
    by = join_by(.sample)
  ) |> 
  arrange(pid, visit_code) 
Code
colonization_qPCR |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = colonized_LBP_at_qpcr) +
  geom_tile() +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
    ) +
  theme(    
    strip.text.y = element_text(angle = 0)
  )

Code
colonization_qPCR |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = LBP_detected_at_qpcr) +
  geom_tile() +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
    ) +
  theme(    
    strip.text.y = element_text(angle = 0)
  )

“LBP colonization” by each visit

From the colonization status at each visit, we can compute the colonization status by each visit.

A participant is considered to have colonized by a visit if they have been colonized at that visit or any previous visit, starting from their post-product visit (“1200” for all groups).

Since we have some missing visits, we impute these as “not colonized”

Code
colonization_qPCR <- 
  colonization_qPCR |> 
  select(-c(arm, randomized, location)) |> 
  dplyr::full_join(
    colonization_qPCR |> select(pid, arm, randomized, location) |> distinct() |> 
      expand_grid(
        visit_code = unique(colonization_qPCR$visit_code) |> sort()
      ),
    by = join_by(pid, visit_code)
  )
Code
colonization_qPCR |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = colonized_LBP_at_qpcr) +
  geom_tile() +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
    ) +
  theme(    
    strip.text.y = element_text(angle = 0)
  )

Code
colonization_qPCR |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = LBP_detected_at_qpcr) +
  geom_tile() +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
    ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

Code
colonization_qPCR <- 
  colonization_qPCR |> 
  arrange(pid, visit_code) |>
  mutate(
    colonized_LBP_at_qpcr = colonized_LBP_at_qpcr |> replace_na(FALSE),
    LBP_detected_at_qpcr = LBP_detected_at_qpcr |> replace_na(FALSE)
    ) |>
  group_by(pid) |> 
  mutate(
    tmp = ifelse(as.numeric(visit_code) < 1200, FALSE, colonized_LBP_at_qpcr),
    colonized_LBP_by_qpcr = cummax(tmp) |> as.logical(),
    tmp = ifelse(as.numeric(visit_code) < 1200, FALSE, LBP_detected_at_qpcr),
    LBP_detected_by_qpcr = cummax(tmp) |> as.logical()
  ) |> 
  ungroup() |> 
  select(-tmp)
Code
colonization_qPCR |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = colonized_LBP_by_qpcr) +
  geom_tile() +
  annotate(
    geom = "rect", 
    xmin = 10.5, xmax = 11.5, ymin = -Inf, ymax = Inf,
    col = "black", alpha = 0, linewidth = 1
  ) +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
  ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

Code
colonization_qPCR |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = LBP_detected_by_qpcr) +
  geom_tile() +
  annotate(
    geom = "rect", 
    xmin = 10.5, xmax = 11.5, ymin = -Inf, ymax = Inf,
    col = "black", alpha = 0, linewidth = 1
  ) +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
  ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

Colonization qPCR VS metagenomics

Code
p1 <- colonization_LBP_mg |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = colonized_LBP_at_mg) +
  geom_tile() +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
    ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

p2 <- colonization_qPCR |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = colonized_LBP_at_qpcr) +
  geom_tile() +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
    ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

p1 + p2 + 
    plot_annotation(
      title = str_c("LBP colonization *at* metagenomics VS qPCR")
    ) +
    plot_layout(ncol = 2) &
    theme(
      legend.justification = "left"
    )

Code
p1 <- colonization_LBP_mg |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = colonized_LBP_by_mg) +
  geom_tile() +
  annotate(
    geom = "rect", 
    xmin = 10.5, xmax = 11.5, ymin = -Inf, ymax = Inf,
    col = "black", alpha = 0, linewidth = 1
  ) +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
  ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )


p2 <- colonization_qPCR |> 
  ggplot() +
  aes(x = visit_code, y = pid, fill = colonized_LBP_by_qpcr) +
  geom_tile() +
  annotate(
    geom = "rect", 
    xmin = 10.5, xmax = 11.5, ymin = -Inf, ymax = Inf,
    col = "black", alpha = 0, linewidth = 1
  ) +
  facet_grid(
    randomized + arm ~ ., 
    scales = "free_y", space = "free_y", labeller = label_both
  ) +
  theme(
    strip.text.y = element_text(angle = 0)
  )

p1 + p2 + 
    plot_annotation(
      title = str_c("LBP colonization by metagenomics VS qPCR")
    ) +
    plot_layout(ncol = 2) &
    theme(
      legend.justification = "left"
    )

Adding primary_outcomes assay to MAE

Code
se_primary_col_LBP_mg <-
  SummarizedExperiment(
    assays = list(
      outcome = 
        colonization_LBP_mg |> 
        # filter(!is.na(pid)) |> 
        # mutate(.sample = str_c(pid, "_", visit_code)) |> 
        dplyr::filter(!is.na(.sample)) |> 
        select(.sample, colonized_LBP_at_mg, colonized_LBP_by_mg) |> 
        as.data.frame() |> 
        column_to_rownames(".sample") |> t()
  ) 
)

se_primary_col_crisp_mg <-
  SummarizedExperiment(
    assays = list(
      outcome = 
        colonization_crispatus_mg |> 
        dplyr::filter(!is.na(.sample)) |> 
        select(.sample, crispatus_dominance_at_mg, crispatus_dominance_by_mg) |> 
        as.data.frame() |> 
        column_to_rownames(".sample") |> t()
  ) 
)

se_primary_col_LBP_qPCR <-
  SummarizedExperiment(
    assays = list(
      outcome = 
        colonization_qPCR |> 
        dplyr::filter(!is.na(.sample)) |> 
        select(.sample, colonized_LBP_at_qpcr, colonized_LBP_by_qpcr, LBP_detected_at_qpcr, LBP_detected_by_qpcr) |> 
        as.data.frame() |> 
        column_to_rownames(".sample") |> t()
    )
  ) 


mae <- c(
  mae, 
  list(col_LBP_mg = se_primary_col_LBP_mg, 
       col_crispatus_mg = se_primary_col_crisp_mg,
       col_LBP_qPCR = se_primary_col_LBP_qPCR)
)

Calculated variables

Code
mae@metadata$calc_vars_pid <- 
  tibble(pid = mae@colData |> as_tibble() |> pull(pid) |> unique())

Education - can we harmonize between sites ?

Code
education <- 
  bind_rows(
    mae@metadata$crf_data_clean$crf1 |> 
      select(uid, pid, visit_code, edu),
    mae@metadata$crf_data_clean$crf101 |> 
      select(uid, pid, visit_code, edu)
  ) |> 
  left_join(
    mae@colData |> as_tibble() |> 
      select(pid, site, randomized) |> 
      distinct(),
    by = join_by(pid)
  )
Code
education |> 
  ggplot() +
  aes(y = edu, fill = randomized) +
  facet_wrap(. ~ site, scales = "free") +
  geom_bar() +
  xlab("Number of participants") +
  ylab("") +
  scale_fill_manual(
    name = "",
    values = c("TRUE" = "dodgerblue1", "FALSE" = "gray70"),
    labels = c("TRUE" = "Randomized", "FALSE" = "Not randomized")
  ) +
  theme(legend.position = "top")

Code
education |> 
  filter(randomized) |> 
  ggplot() +
  aes(y = edu) +
  facet_wrap(. ~ site, scales = "free") +
  geom_bar(fill = "dodgerblue1") +
  xlab("Number of participants") +
  ylab("")

Economic worry

Code
worry <- 
   bind_rows(
    mae@metadata$crf_data_clean$crf2 |> 
      select(uid, pid, visit_code, starts_with("worry_")),
    mae@metadata$crf_data_clean$crf102 |> 
      select(uid, pid, visit_code, starts_with("worry_"))
  ) |> 
  left_join(
    mae@colData |> as_tibble() |> 
      select(pid, site, randomized) |> 
      distinct(),
    by = join_by(pid)
  )
Code
# create an upset plot to see the combinations of worries
library(UpSetR)
# create a binary matrix for the worries
worry_matrix_CAP <-
  worry |>
  filter(randomized, site == "CAP", !is.na(worry_food)) |>
  select(starts_with("worry_")) |>
  # remove "worry_" prefix from column names
  rename_with(~ str_remove_all(., "worry_")) |>
  mutate(across(everything(), ~ ifelse(. == "Checked", 1, 0))) |>
  as.data.frame()

upset(
  worry_matrix_CAP,
  nsets = ncol(worry_matrix_CAP),
  nintersects = NA,
  keep.order = TRUE,
  order.by = "freq",
  #text.scale = c(2, 2, 2, 1.5, 2, 2),
  main.bar.color = "dodgerblue1",
  sets.bar.color = "gray70",
  set_size.show = TRUE
  )

Code
worry_matrix_MGH <-
  worry |>
  filter(randomized, site == "MGH", !is.na(worry_food)) |>
  select(starts_with("worry_")) |>
  # remove "worry_" prefix from column names
  rename_with(~ str_remove_all(., "worry_")) |>
  mutate(across(everything(), ~ ifelse(. == "Checked", 1, 0))) |>
  as.data.frame()

upset(
  worry_matrix_MGH,
  nsets = ncol(worry_matrix_MGH),
  nintersects = NA,
  keep.order = TRUE,
  order.by = "freq",
  #text.scale = c(2, 2, 2, 1.5, 2, 2),
  main.bar.color = "dodgerblue1",
  # sets.bar.color = "gray70",
  set_size.show = TRUE
)

Code
table1_data <- 
  metadata(mae)$participant_crfs_merged |> 
  dplyr::left_join(
    colData(mae) |> as_tibble() |> select(pid, site, arm, ITT, mITT, PP) |> distinct(), 
    by = join_by(pid, site)
    )  |> 
  filter(mITT) |> 
  select(pid, site, arm, cut_size_meals, eat_less, hungry_did_not_eat) |> 
  mutate(
    food_insecurity = ifelse(
      cut_size_meals == "Yes" | eat_less == "Yes" | hungry_did_not_eat == "Yes",
      "Yes",
      "No"
    ),
    food_insecurity = factor(food_insecurity, levels = c("No", "Yes"))
  ) |> 
  select(-c(eat_less, cut_size_meals, hungry_did_not_eat))
Code
table1_data |> 
  left_join(
    worry |> select(pid, worry_food) |> distinct(),
    by = join_by(pid)
  ) |> 
  dplyr::count(site, food_insecurity, worry_food) |> 
  ggplot() +
  aes(x = food_insecurity, y = worry_food, fill = n) +
  geom_tile() +
  geom_text(aes(label = n)) +
  facet_wrap(. ~ site, scales = "free") +
  scale_fill_gradient(low = "white", high = "dodgerblue1") 

Pre-existing conditions

Code
pre_existing_conditions <- 
  mae@metadata$crf_data_clean$crf5 |> 
  select(pid, no_pre_existing_condition, diagnosis) |> 
  left_join(
    mae@colData |> as_tibble() |> 
      select(pid, site, randomized) |> 
      distinct(),
    by = join_by(pid)
  )
Code
pre_existing_conditions |> 
  filter(randomized) |> 
  dplyr::mutate(
    diagnosis = 
      case_when(
        no_pre_existing_condition ~ "No pre-existing condition",
        TRUE ~ diagnosis
        )
    ) |> 
  select(pid, diagnosis, site) |> 
  distinct() |> 
  dplyr::count(site, diagnosis) |> 
  arrange(-n) |> 
  mutate(diagnosis = diagnosis |> fct_inorder()) |>
  ggplot() +
  aes(x = n, y = diagnosis) +
  facet_grid(. ~ site, scales = "free_x") +
  geom_col()

Psychological wellbeing

Code
psych_wellbeing <- 
  mae@metadata$crf_data_clean$crf11 |> 
  select(pid, upset:difficulties_piling_up) |>
  distinct() |> 
  left_join(
    mae@colData |> as_tibble() |> 
      select(pid, site, randomized) |> 
      distinct(),
    by = join_by(pid)
  )

psych_wellbeing_long <- 
  psych_wellbeing |> 
  filter(randomized) |>
  filter(!is.na(upset)) |>
  pivot_longer(
    cols = upset:difficulties_piling_up,
    names_to = "question",
    values_to = "answer"
  )
Code
psych_wellbeing_long |> 
  group_by(site, pid) |> 
  mutate(total_score = sum(answer |> as.integer())) |>
  ungroup() |> 
  arrange(total_score) |> 
  mutate(
    pid = pid |> fct_inorder()
  ) |>
  ggplot() +
  aes(y = pid, x = question, fill = answer) +
  facet_grid(site ~ ., scales = "free_y", space = "free") +
  geom_tile() +
  scale_fill_brewer(type = "div", palette = "PRGn") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text.y = element_text(angle = 0)
  )

Sexual behavior

PSA

Code
psa <- 
  mae@metadata$crf_data_clean$crf45 |> 
  select(uid, pid, visit_code, psa_results) |> 
  left_join(
    mae@colData |> as_tibble() |> 
      select(pid, site, randomized) |> 
      distinct(),
    by = join_by(pid)
  )
Code
psa |> 
  ggplot() +
  aes(x = visit_code, y = pid) +
  geom_tile(fill = "lightseagreen")

Metronidazole use

Code
mtz_crf_raw <- 
  mae@metadata$crf_data_raw$crf22 |> 
  left_join(
    mae@colData |> as_tibble() |> 
      select(pid, site, randomized) |> 
      distinct(),
    by = join_by(pid)
  )

mtz_crf_raw |> 
  dplyr::count(dfseq)
# A tibble: 3 × 2
  dfseq     n
  <dbl> <int>
1  1100    94
2  1200    54
3  1300     2
Code
mtz_crf_raw |> 
  filter(randomized) |> 
  dplyr::count(dfseq)
# A tibble: 3 × 2
  dfseq     n
  <dbl> <int>
1  1100    94
2  1200    54
3  1300     2
Code
mtz_crf_raw |> 
  filter(randomized) |> 
  dplyr::filter(dfseq == 1200) 
# A tibble: 54 × 13
   dfraster     pid       dfseq study_day study_day_original uid      visit_code
   <chr>        <chr>     <dbl>     <dbl>              <dbl> <chr>    <chr>     
 1 2416/007F001 068100010  1200        10                 10 0681000… 1200      
 2 2418/00GC001 068100011  1200        15                 15 0681000… 1200      
 3 2410/00K1001 068200008  1200        15                 15 0682000… 1200      
 4 2409/00L4001 068200020  1200        17                 17 0682000… 1200      
 5 2410/0051001 068200023  1200        15                 15 0682000… 1200      
 6 2412/00HV001 068200028  1200        15                 15 0682000… 1200      
 7 2414/0042001 068200044  1200        15                 15 0682000… 1200      
 8 2417/00CN032 068200050  1200        16                 16 0682000… 1200      
 9 2417/00SD001 068200061  1200        15                 15 0682000… 1200      
10 2421/00CW001 068200118  1200        15                 15 0682001… 1200      
# ℹ 44 more rows
# ℹ 6 more variables: vdate_fixed <lgl>, prescription_container <chr>,
#   tablets_remain <int>, metro_dose_miss <chr>, site <chr>, randomized <lgl>
Code
mtz_crf_raw |> 
  filter(randomized) |> 
  dplyr::count(pid) |> 
  arrange(-n) |> 
  dplyr::count(n)
# A tibble: 2 × 2
      n    nn
  <int> <int>
1     1    40
2     2    55
Code
mtz_crf_raw |> 
  group_by(pid) |> 
  mutate(min_study_day = min(study_day)) |> 
  ungroup() |> 
  mutate(pid = pid |> fct_reorder(min_study_day)) |>
  ggplot() +
  aes(x = study_day, y = pid, col = dfseq |> as.character()) +
  geom_point()

Code
mae@metadata$exposures |> 
  ggplot() +
  aes(x = study_day, y = pid, col = metronidazole |> factor()) + geom_point() +
  scale_color_manual(
    name = "Metronidazole use",
    na.value = "lightgray",
    values = c("0" = "red", "1" = "purple", "2" = "blue")
  ) +
  geom_point()

Saving the MultiAssayExperiment objects

We save the MAE for downstream analyses.

Code
saveRDS(
  mae, 
  str_c(
    get_03_output_dir(),  
    "mae_full_", today() |> str_remove_all("-"), ".rds"
  )
)